package com.chance.cc.crawler.development.bootstrap.amazon;

import com.alibaba.fastjson.JSON;
import com.chance.cc.crawler.core.CrawlerEnum;
import com.chance.cc.crawler.core.downloader.proxy.Proxy;
import com.chance.cc.crawler.core.filter.FilterUtils;
import com.chance.cc.crawler.core.record.CrawlerRecord;
import com.chance.cc.crawler.core.record.CrawlerRequestRecord;
import com.chance.cc.crawler.development.controller.DevCrawlerController;
import org.apache.commons.lang3.RandomUtils;
import org.apache.commons.lang3.StringUtils;

import java.util.ArrayList;
import java.util.List;

import static com.chance.cc.crawler.core.CrawlerEnum.CrawlerRequestType.*;
import static com.chance.cc.crawler.core.CrawlerEnum.CrawlerRequestType.internalDownload;

/**
 * Development bootstrap that launches a local crawl of amazon.cn.
 *
 * <p>{@link #main(String[])} builds a seed request (see {@link #news()}) together with a
 * supporting keyword request and hands both to a {@link DevCrawlerController}, which is built
 * against the {@code com.chance.cc.crawler.development.scripts.amazon} package.
 *
 * <p>NOTE(review): the proxy credentials and the session cookie below are hard-coded in source.
 * They should presumably come from configuration or the environment — confirm before reuse.
 */
public class AmazonStart {
    private static final String domain = "amazon";
    public static final String site = "commodity";

    /** Proxy attached to the seed request built by {@link #news()}. */
    private static final Proxy proxy = new Proxy();

    static {
        // Proxy configuration.
        // The two values below were left as comments by the original author; presumably an
        // alternate credential pair — TODO confirm and remove if obsolete.
        //H5168QRFNIU3804D
        //5F6B3610BB719FAA
        proxy.setHost("http-dyn.abuyun.com");
        proxy.setPort(9020);
        proxy.setUsername("HL89Q19E86E2987D");
        proxy.setPassword("71F33D94CE5F7BF2");
    }

    /**
     * Entry point: builds the seed and keyword requests, configures the dev controller and
     * starts it.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {

        CrawlerRequestRecord crawlerRequestRecord = news();

        // Supporting request that downloads the keyword list from the internal meta service.
        CrawlerRequestRecord keywordRecord = CrawlerRequestRecord.builder()
                .startPageRequest("amazon_series_keyword", turnPageItem)
                .httpUrl("http://192.168.1.217:9599/v1/meta/amazon/keys?site=commodity")
                .requestLabelTag(supportSource)
                .requestLabelTag(internalDownload)
                .build();

        // Results for the "redis" and "kafka" pipelines are echoed to the console and also
        // written to local log files.
        DevCrawlerController devCrawlerController = DevCrawlerController.builder()
                .triggerInfo(domain, domain + "_trigger", System.currentTimeMillis(), domain + "_job")
                .crawlerRequestQueue(DevCrawlerController.devRequestQueue(domain))
                .consoleResultPipeline("redis")
                .consoleResultPipeline("kafka")
                .fileResultPipeline("redis", "D:\\chance_log\\亚马逊redis-8-3_1.log", false)
                .fileResultPipeline("kafka", "D:\\chance_log\\亚马逊Kafka-8-3_1.log", false)
                .crawlerThreadNum(30)
                .supportRecord(keywordRecord)
                .requestRecord(crawlerRequestRecord)
                .build("com.chance.cc.crawler.development.scripts.amazon");
        devCrawlerController.start();
    }

    /**
     * Builds the seed request for the amazon.cn home page.
     *
     * <p>The request carries the proxy, a browser-like header set with a randomly chosen
     * {@code User-Agent}, key/date-range filter information, the domain and site business tags,
     * and a {@code comment_record_filter_info} tag holding the JSON-serialized filter settings
     * for comments.
     *
     * @return the configured seed request record
     */
    public static CrawlerRequestRecord news() {
        // Alternative seed kept by the original author: a keyword search result page.
//        String url = "https://www.amazon.cn/s?k=nike&page=1";
        String url = "https://www.amazon.cn";

        CrawlerRequestRecord crawlerRequestRecord = CrawlerRequestRecord.builder()
                .startPageRequest(domain, CrawlerEnum.CrawlerRequestType.turnPage)
                .domain(domain)
                .httpUrl(url)
                .recordKey(url)
                .releaseTime(System.currentTimeMillis())
                .filter(CrawlerEnum.CrawlerRecordFilter.keyOrDateRange)
                .addFilterInfo(FilterUtils.memoryFilterKeyInfo(domain))
                // NOTE(review): 24*365 is presumably a look-back window in hours (one year) — confirm.
                .addFilterInfo(FilterUtils.dateRangeFilterInfo(24 * 365, null))
                .resultLabelTag(CrawlerEnum.CrawlerDataType.article)
                .resultLabelTag(CrawlerEnum.CrawlerDataType.comment)
                .proxy(proxy)
                .httpHead("Connection", "keep-alive")
                .httpHead("Cache-Control", "max-age=0")
                .httpHead("rtt", "100")
                .httpHead("downlink", "10")
                .httpHead("ect", "4g")
                .httpHead("sec-ch-ua", "\"Google Chrome\";v=\"93\", \" Not;A Brand\";v=\"99\", \"Chromium\";v=\"93\"")
                .httpHead("sec-ch-ua-mobile", "?0")
                .httpHead("Upgrade-Insecure-Requests", "1")
                .httpHead("User-Agent", getRandomUA())
                .httpHead("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9")
                .httpHead("Sec-Fetch-Site", "same-origin")
                .httpHead("Sec-Fetch-User", "?1")
                .httpHead("Sec-Fetch-Dest", "document")
                .httpHead("Accept-Language", "zh-CN,zh;q=0.9")
                // NOTE(review): hard-coded cookie, presumably captured from a browser session; it may expire.
                .httpHead("Cookie", "session-id=462-8168812-7024355; i18n-prefs=CNY; lc-acbcn=zh_CN; ubid-acbcn=460-7097111-9030419; session-token=ugupkmxReZDKiKU9dL2bTGGOPx7wT4YHSOqf8ABSKwOEdq0RIJRHKMf4phDM7NatJN8aKJR16euN4hvm/azrL/N0PpnggNhDcOs51w65DOCWe2SNI9Li/NwcPUe6lzCxB6U76vQRSCWSWGtFaPSqeUiBGWOfULVJ6Hi0qYFfXpVn61b1Ne6k4q8Ez/wi4uy7; session-id-time=2082787201l; csm-hit=tb:X1YHY8ZT6ANH2MPSB6AF+s-HHNT7Y9CTWD0RJF70XG5|1634023110426&t:1634023110427&adb:adblk_no")
                .build();
        crawlerRequestRecord.tagsCreator().bizTags().addDomain(domain);
        crawlerRequestRecord.tagsCreator().bizTags().addSite(site);

        // Attach de-duplication (filter) settings for comments.
        CrawlerRecord filterCrawlerRecord = new CrawlerRecord(); // carrier for the filter settings
        filterCrawlerRecord.setFilter(CrawlerEnum.CrawlerRecordFilter.keyOrDateRange); // filter by key or date range
        // In-memory key filter; `filter` here is the statically imported CrawlerRequestType constant.
        filterCrawlerRecord.addFilterInfo(FilterUtils.memoryFilterKeyInfo(StringUtils.joinWith("-", filter, domain, "comment")));
        filterCrawlerRecord.addFilterInfo(FilterUtils.dateRangeFilterInfo(24 * 365, null)); // date-range filter
        // Stored as a custom business tag in JSON form.
        crawlerRequestRecord.tagsCreator().bizTags().addCustomKV("comment_record_filter_info", JSON.toJSONString(filterCrawlerRecord));

        return crawlerRequestRecord;
    }

    /** Pool of User-Agent strings from which {@link #getRandomUA()} picks one. */
    private static final List<String> agentList = new ArrayList<>();

    static {
        agentList.add("Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36");
        agentList.add("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36");
        agentList.add("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36");
        agentList.add("Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2226.0 Safari/537.36");
        agentList.add("Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; AS; rv:11.0) like Gecko");
        agentList.add("Mozilla/5.0 (compatible, MSIE 11, Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko");
        agentList.add("Mozilla/5.0 (compatible; MSIE 10.6; Windows NT 6.1; Trident/5.0; InfoPath.2; SLCC1; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET CLR 2.0.50727) 3gpp-gba UNTRUSTED/1.0");
        agentList.add("Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 7.0; InfoPath.3; .NET CLR 3.1.40767; Trident/6.0; en-IN)");
        agentList.add("Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)");
        agentList.add("Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)");
        agentList.add("Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/5.0)");
        agentList.add("Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/4.0; InfoPath.2; SV1; .NET CLR 2.0.50727; WOW64)");
        agentList.add("Mozilla/5.0 (compatible; MSIE 10.0; Macintosh; Intel Mac OS X 10_7_3; Trident/6.0)");
        agentList.add("Mozilla/4.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/5.0)");
        agentList.add("Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.2 (KHTML, like Gecko) ChromePlus/4.0.222.3 Chrome/4.0.222.3 Safari/532.2");
        agentList.add("Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.28.3 (KHTML, like Gecko) Version/3.2.3 ChromePlus/4.0.222.3 Chrome/4.0.222.3 Safari/525.28.3");
        agentList.add("Opera/9.80 (X11; Linux i686; Ubuntu/14.10) Presto/2.12.388 Version/12.16");
        agentList.add("Opera/9.80 (Windows NT 6.0) Presto/2.12.388 Version/12.14");
        agentList.add("Mozilla/5.0 (Windows NT 6.0; rv:2.0) Gecko/20100101 Firefox/4.0 Opera 12.14");
        agentList.add("Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0) Opera 12.14");
        agentList.add("Opera/12.80 (Windows NT 5.1; U; en) Presto/2.10.289 Version/12.02");
        agentList.add("Opera/9.80 (Windows NT 6.1; U; es-ES) Presto/2.9.181 Version/12.00");
        agentList.add("Opera/9.80 (Windows NT 5.1; U; zh-sg) Presto/2.9.181 Version/12.00");
        agentList.add("Opera/12.0(Windows NT 5.2;U;en)Presto/22.9.168 Version/12.00");
        agentList.add("Opera/12.0(Windows NT 5.1;U;en)Presto/22.9.168 Version/12.00");
        agentList.add("Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.1");
        agentList.add("Mozilla/5.0 (Windows NT 6.3; rv:36.0) Gecko/20100101 Firefox/36.0");
        agentList.add("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10; rv:33.0) Gecko/20100101 Firefox/33.0");
        agentList.add("Mozilla/5.0 (X11; Linux i586; rv:31.0) Gecko/20100101 Firefox/31.0");
        agentList.add("Mozilla/5.0 (Windows NT 6.1; WOW64; rv:31.0) Gecko/20130401 Firefox/31.0");
        agentList.add("Mozilla/5.0 (Windows NT 5.1; rv:31.0) Gecko/20100101 Firefox/31.0");
        agentList.add("Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.13 Safari/537.36");
        agentList.add("Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3756.400 QQBrowser/10.5.4043.400");
    }

    /**
     * Picks one User-Agent string at random from {@link #agentList}.
     *
     * @return a randomly selected User-Agent string
     */
    private static String getRandomUA() {
        // RandomUtils.nextInt treats its second argument as an exclusive upper bound, so the
        // full list size is passed; using size() - 1 would make the last entry unreachable.
        return agentList.get(RandomUtils.nextInt(0, agentList.size()));
    }
}

